Randomly selecting files for research

This code randomly selects 1000 known-good files and 1000 malware files and copies them into folders for feature extraction, with each set split 600/400 between a `train` and a `test` folder. The "known-good" files are sourced from a virtual machine running on Parallels that has Windows 8 sp1 installed, along with Microsoft Office and various other programs typically found on an office computer.

Assumptions and limitations

I assume that the computer sourcing the benign files contains only files that are not malware. I am unable to prove that my source machine is not loaded up with super-secret OMG 中国网軍 malware. I am also unable to prove that the malware sample set I'll be examining does not contain the occasional benign file that snuck in somewhere.


In [2]:
import errno
import os
import random
import shutil
import uuid

import pefile
from pefile import PEFormatError
# Root of the directory tree that is walked to collect benign .exe candidates.
path_to_windows_root = "/Volumes/C"
# Directory tree that is walked to collect malware samples (files whose names
# start with 'VirusShare').
path_to_malware = "/Users/v527234/Documents/malware"
# Fixed seed for the random module; peFiles() shuffles with random.shuffle, so
# the shuffle order is repeatable for an identical input list.
random.seed("Follow @bfist on twitter, fool!")
# When True, the cells below (re)create the train/test folders and copy the
# selected files into them.
OVERWRITE = True
# Directory names presumably meant to be left out when walking the Windows
# root -- NOTE(review): confirm how this list is expected to be applied.
exclude_dirs = ['WinSxS','Installer']

In [5]:
def peFiles(inList):
    """Yield, in random order, the paths from inList that pefile can parse.

    A path is yielded only when pefile.PE() accepts the file and the parsed
    object exposes DIRECTORY_ENTRY_IMPORT. Files for which parsing raises
    PEFormatError or AttributeError are skipped.

    Args:
        inList: iterable of file paths. It is copied before shuffling, so the
            caller's sequence is left untouched.

    Yields:
        Each qualifying path, in an order determined by the random module's
        current state.
    """
    candidates = list(inList)
    random.shuffle(candidates)
    for eachFile in candidates:
        pe = None
        try:
            pe = pefile.PE(eachFile)
            # Accessing the attribute raises AttributeError when the parsed
            # object has no import directory.
            len(pe.DIRECTORY_ENTRY_IMPORT)
        except (PEFormatError, AttributeError):
            # Not usable: move on to the next candidate.
            continue
        finally:
            # Release the parsed file so scanning thousands of candidates does
            # not accumulate open resources. Guarded with hasattr in case the
            # installed pefile release does not provide close().
            if pe is not None and hasattr(pe, 'close'):
                pe.close()
        # Yield outside the try so exceptions raised by the consumer are never
        # mistaken for a parse failure.
        yield eachFile

In [3]:
# Collect every '.exe' path under the Windows root as a benign candidate.
benign_pe_files = []
for root, dirs, files in os.walk(path_to_windows_root):
    # Prune the excluded folders in place so os.walk never descends into them
    # (exclude_dirs is defined with the other settings at the top).
    dirs[:] = [d for d in dirs if d not in exclude_dirs]
    for fileName in files:
        if fileName.endswith('.exe'):
            # Append to benign_pe_files, the list that is later handed to
            # peFiles(); the previous target name was never defined.
            benign_pe_files.append(os.path.join(root, fileName))

In [17]:
# Gather the full path of every file under path_to_malware whose name starts
# with 'VirusShare'.
malware_pe_files = []
for folder, _subdirs, names in os.walk(path_to_malware):
    malware_pe_files.extend(
        os.path.join(folder, name)
        for name in names
        if name.startswith('VirusShare')
    )

In [18]:
# Lazy generators over the two candidate lists; nothing is parsed until the
# copy cells pull items with next().
good_pe_files, bad_pe_files = peFiles(benign_pe_files), peFiles(malware_pe_files)

In [11]:
# Make sure train/ and test/ each contain an empty benign/ and malicious/
# folder before any files are copied in.
if OVERWRITE:
    for dirName in ['train', 'test']:
        for subdir in ['benign', 'malicious']:
            target = os.path.join(dirName, subdir)
            try:
                # makedirs also creates the parent, so every leaf folder is
                # created even when train/ or test/ already exists.
                os.makedirs(target)
                print("Created folder %s." % target)
            except OSError as e:
                # Compare the error code rather than the message text, and do
                # not hide unrelated failures such as permission errors.
                if e.errno != errno.EEXIST:
                    raise
                print("%s already exists. Deleting its files" % target)
                # Samples live in the leaf folders, so that is where stale
                # files from a previous run are removed.
                for each in os.listdir(target):
                    os.remove(os.path.join(target, each))


benign already exists. Deleting its files
malicious already exists. Deleting its files

In [12]:
# Copy 600 benign files into the training set and the next 400 into the test
# set. Each copy gets a fresh UUID prefix followed by the source path with
# '/' replaced by '-' and spaces by '_', so identically named files from
# different folders cannot collide.
if OVERWRITE:
    # The 400-file split goes to test/benign/; it was previously written to
    # train/benign/, which left the benign test set empty.
    for destDir, count in [('train/benign/', 600), ('test/benign/', 400)]:
        for i in range(count):
            # next() raises StopIteration if fewer parsable files are
            # available than requested.
            eachFile = next(good_pe_files)
            flatName = eachFile.replace('/','-').replace(' ','_')
            shutil.copyfile(eachFile, destDir + str(uuid.uuid4()) + '-' + flatName)

In [21]:
# Copy the first 600 parsable malware samples into the training set and the
# following 400 into the test set, appending '.exe' to each sample's base name.
if OVERWRITE:
    # (destination prefix, number of samples) for each split, in copy order.
    for destPrefix, howMany in (("train/malicious/", 600), ("test/malicious/", 400)):
        for _ in range(howMany):
            sample = next(bad_pe_files)
            shutil.copyfile(sample, destPrefix + os.path.basename(sample) + ".exe")

In [ ]:


In [ ]: